Overview

There is mainly two parts to this story:

  1. A simple easy model of the key relationship

  2. A medium complexity smoothing analysis

The two data set used in this analysis are the Madison case and waste water concentration data.

##           Date    Site Cases    N1 N1Error
## 435 2021-06-19 Madison     3    NA      NA
## 436 2021-06-20 Madison     8 49443    8812
## 437 2021-06-21 Madison     1 90447   20429
## 438 2021-06-22 Madison     3 44587   12427
## 439 2021-06-23 Madison     4 14469    9155
## 440 2021-06-24 Madison    NA 28023    7724

A simple display of the data shows the core components of this story. First that both data sets are extremely noisy. And that there is a hint of a relationship between the two signals.

FirstImpression <- FullDF%>%
  NoNa("N1","Cases")%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=MinMaxNormalization(N1), color="N1",info=N1))+#compares N1 to Cases
  geom_line(aes(y=MinMaxNormalization(Cases), color="Cases",info=Cases))+
  labs(y="variable min max normalized")+
  ColorRule

ggplotly(FirstImpression,tooltip=c("info","Date"))

Cross correlation and Granger Causality are key component to this analysis. Cross correlation looks at the correlation at a range of time shifts and Granger analysis preform a test for predictive power. We find that there is to much noise to find significance.

TestDF1 <- FullDF%>%
  FillNA("N1","Cases")%>%#filling in missing data in range with previous values
  NoNa("N1","Cases")#removing rows from before both series started#filling in missing data in range with previous values


Cases <- TestDF1$Cases
N1 <- TestDF1$N1

ccf(Cases,N1,na.action=na.pass)

grangertest(Cases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(Cases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df     F Pr(>F)
## 1    279                
## 2    280 -1 2.623 0.1065
grangertest(N1,Cases, order = 1)
## Granger causality test
## 
## Model 1: Cases ~ Lags(Cases, 1:1) + Lags(N1, 1:1)
## Model 2: Cases ~ Lags(Cases, 1:1)
##   Res.Df Df      F Pr(>F)
## 1    279                 
## 2    280 -1 0.1124 0.7377
IntermediateOutlierGraphic <- TRUE #

DaySmoothed=21#Very wide smoothing to find where the data strong deviate from trend

FullDF2 <- FullDF%>%#Remove older data that clearly has no relationship to Cases
  mutate(N1 = ifelse(Date < mdy("11/20/2020"),NA,N1))

FullDF3 <- FullDF2%>%
  mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median, 
                            na.r = TRUE,fill=NA),#Finding very smooth version of the data with no outliers
         SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1),#Fixing issue where rollapply fills NA on right border
         LargeError=N1>5*SmoothN1,#Calculating error Limits
         N1=ifelse(LargeError,SmoothN1,N1))%>%#replacing data points that variance is to large
  select(-SmoothN1,-LargeError)#Removing unneeded calculated columns

if(IntermediateOutlierGraphic){
  OutlierGraphic <- FullDF%>%
    mutate(SmoothN1=rollapply(data = N1, width = DaySmoothed, FUN = median, 
                            na.r = TRUE,fill=NA),#creating smooth data
           SmoothN1=ifelse(is.na(SmoothN1),N1,SmoothN1))%>%#Fixing issue where rollapply fills NA on right border)%>%
    mutate(Outliers=Date < mdy("11/20/2020")|N1>5*SmoothN1)%>%
    NoNa("N1","Cases")%>%#Removing outliers
    ggplot(aes(x=Date))+#Data depends on time
    geom_point(aes(y=MinMaxNormalization(Cases), color="Cases",info=Cases))+
    geom_point(aes(y=MinMaxNormalization(N1), color="N1",shape=Outliers,info=N1))+#compares N1 to Cases
    labs(y="variable min max normalized")
  ggplotly(OutlierGraphic,tooltip=c("info","Date"))
}


Ref <- FullDF3%>%
  NoNa("N1","Cases")%>%#Removing outliers
  ggplot(aes(x=Date))+#Data depends on time
  geom_line(aes(y=MinMaxNormalization(N1), color="N1", info = N1))+#compares N1 to Cases
  geom_line(aes(y=MinMaxNormalization(Cases), color="Cases",info = Cases))+
  labs(y="variable min max normalized")+
  ColorRule

ggplotly(Ref,tooltip=c("info","Date"))
TestDF2 <- FullDF3%>%
  FillNA("N1","Cases")%>%#filling in missing data in range with previous values
  NoNa("N1","Cases")#removing rows from before both series started


Cases <- TestDF2$Cases
N1 <- TestDF2$N1

#library(tseries) Tests for stationarity. potential use is obvious
# kpss.test(Cases)
# adf.test(Cases)
# kpss.test(N1)
# adf.test(N1)

ccf(Cases,N1,na.action=na.pass)

grangertest(Cases, N1, order = 1)
grangertest(N1,Cases, order = 1)

A key component to this relationship is that the relationship between N1 and Case involves a gamma distribution modeling both the time between catching Covid-19 and getting a test and the concentration of the shedded particles. We found a gamma distribution with mean 11.73 days and a standard deviation of 7.68 match’s other research and gives good results.

SLDWidth <- 21
scale  <- 5.028338
shape  <- 2.332779 #These parameters are equivalent to the mean and sd above

weights <- dgamma(1:SLDWidth, scale = scale, shape = shape)
plot(weights,  
            main=paste("Gamma Distribution with mean = 11.73 days,and SD = 7.68"), 
            ylab = "Weight", 
            xlab = "Lag")

SLDSmoothedDF <- FullDF3%>%
  mutate(
    SLDCases = c(rep(NA,SLDWidth-1),#elimination of starting values not relevant as we have a 50+ day buffer of case data
                        rollapply(Cases,width=SLDWidth,FUN=weighted.mean,
                                  w=weights,
                                  na.rm = FALSE)))#no missing data to remove


SLDPlot = SLDSmoothedDF%>%
  NoNa("N1","SLDCases")%>%#same plot as earlier but with the SLD smoothing
  ggplot(aes(x=Date))+
  geom_line(aes(y=MinMaxNormalization(SLDCases), color="SLDCases",info = SLDCases))+
  geom_line(aes(y=MinMaxNormalization(N1), color="N1",info = N1))+
  facet_wrap(~Site)+
  labs(y="variable min max normalized")+
  ColorRule

ggplotly(SLDPlot,tooltip=c("info","Date"))

The SLD Cause a signfigent improvement in the relationship now showing a statisticly signifigent test result

TestDF3 <- SLDSmoothedDF%>%
  FillNA("N1","SLDCases")%>%#filling in missing data in range with previous values
  NoNa("N1","SLDCases")#removing rows from before both series started

SLDCases <- TestDF3$SLDCases
N1 <- TestDF3$N1

ccf(SLDCases,N1,na.action=na.pass)

grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F   Pr(>F)   
## 1    279                      
## 2    280 -1 8.3701 0.004115 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 16.616 5.974e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

To isolate this relationship we used a primitive binning relationship. This clarifies the relationship we see hints of in the previous graphic and masks the noise in the data.

StartDate <- 1 #Where the binning starts
DaySmoothing <- 14 #The size of the bins
Lag <- 7 #The offset between Cases and N1

BinDF <- SLDSmoothedDF%>%
  select(Date, Cases, N1)%>%
    mutate(MovedCases = data.table::shift(Cases, Lag),#moving  SLD lag days backwards
           Week=(as.numeric(Date)+StartDate)%/%DaySmoothing)%>%#putting variables into bins via integer division
  group_by(Week)%>%
  #filter(Week>2670)%>%
  summarise(BinnedCases=median(MovedCases, na.rm=TRUE), BinnedN1=(median((N1), na.rm=TRUE)), Date = mean(Date,na.rm = TRUE))#summarize data within bins.



BinedGraph <- BinDF%>%
  NoNa("BinnedN1","BinnedCases")%>%#Remove NA
  ggplot(aes(x=Date))+
  geom_line(aes(y=MinMaxNormalization(BinnedN1), color="N1" , info = BinnedN1))+
  geom_line(aes(y=MinMaxNormalization(BinnedCases), color="Cases", info = BinnedCases))+
  labs(y="Binned variable min max normalized")+
  ColorRule

ggplotly(BinedGraph,tooltip=c("info","Date"))
BinedGraph

BinedCorrGraph <- BinDF%>%
  ggplot()+
  geom_point(aes(x=BinnedCases, y=BinnedN1, info = Date))
ggplotly(BinedCorrGraph,tooltip=c("x","y","info"))
cor(BinDF$BinnedN1, BinDF$BinnedCases, use="pairwise.complete.obs")
## [1] 0.3365265
summary(lm(BinnedCases~BinnedN1, data=BinDF))
## 
## Call:
## lm(formula = BinnedCases ~ BinnedN1, data = BinDF)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.54 -55.36 -31.51  40.55 240.47 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 5.637e+01  3.161e+01   1.783   0.0897 .
## BinnedN1    3.348e-04  2.095e-04   1.598   0.1257  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.12 on 20 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.1133, Adjusted R-squared:  0.06891 
## F-statistic: 2.554 on 1 and 20 DF,  p-value: 0.1257

To generate this relationship without reducing the amount of data we rely on a loess smoothing of the data. The loess smoothing is a way of generating smooth curves from noisy data. The displayed plot shows the visual power of this smoothing. We see a relationship in the big patterns but also multiple sub patterns match. We see in general that smoothed N1 both lags and leads the case data.

SLDSmoothedDF$loessN1 <- loessFit(y=(SLDSmoothedDF$N1), 
                      x=SLDSmoothedDF$Date, #create loess fit of the data
                      span=.2, #span of .2 seems to give the best result, not rigorously chosen
                      iterations=2)$fitted#2 iterations remove some bad patterns

SLDLoessGraphic <- SLDSmoothedDF%>%
  NoNa("loessN1","SLDCases")%>%
  ggplot(aes(x=Date))+
  geom_line(aes(y=MinMaxNormalization(loessN1), color="loessN1" , info = loessN1))+
  geom_line(aes(y=MinMaxNormalization(SLDCases), color="SLDCases" , info = SLDCases))+
  facet_wrap(~Site)+
  labs(y="variable min max normalized")+
  ColorRule


ggplotly(SLDLoessGraphic,tooltip=c("info","Date"))

The loess smoothing roughly doubled the correlation at all time lags. The two granger tests p value also got signifigently smaller

TestDF4 <- SLDSmoothedDF%>%
  FillNA("loessN1","SLDCases")%>%#filling in missing data in range with previous values
  NoNa("loessN1","SLDCases")#removing rows from before both series started

SLDCases <- TestDF4$SLDCases
N1 <- TestDF4$loessN1

ccf(SLDCases,N1,na.action=na.pass)

grangertest(SLDCases, N1, order = 1)
## Granger causality test
## 
## Model 1: N1 ~ Lags(N1, 1:1) + Lags(SLDCases, 1:1)
## Model 2: N1 ~ Lags(N1, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 89.183 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
grangertest(N1,SLDCases, order = 1)
## Granger causality test
## 
## Model 1: SLDCases ~ Lags(SLDCases, 1:1) + Lags(N1, 1:1)
## Model 2: SLDCases ~ Lags(SLDCases, 1:1)
##   Res.Df Df      F    Pr(>F)    
## 1    279                        
## 2    280 -1 69.758 3.177e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1